Modul 2 Praktikum PSD 2023/2024 Semester Ganjil

Kembali ke Pengantar Sains Data

Versi file .R dari modul ini bisa diunduh: Modul 2 (REV).R

Visualisasi Tambahan

STEM AND LEAF PLOT

Untuk ngeliat persebaran data (min,max,distribusi), mirip kaya histogram

# Open the help page for stem().
?stem
View(ChickWeight) # built-in R dataset
# Stem-and-leaf display of the chick weights, using the default settings.
stem(ChickWeight$weight)

  The decimal point is 1 digit(s) to the right of the |

   2 | 599999999
   4 | 00000111111111111111111112222222222222223333456678888888899999999999+38
   6 | 00111111122222222333334444455555666677777888888900111111222222333334+8
   8 | 00112223344444455555566777788999990001223333566666788888889
  10 | 0000111122233333334566667778889901122223445555667789
  12 | 00002223333344445555667788890113444555566788889
  14 | 11123444455556666677788890011234444555666777777789
  16 | 00002233334444466788990000134445555789
  18 | 12244444555677782225677778889999
  20 | 0123444555557900245578
  22 | 0012357701123344556788
  24 | 08001699
  26 | 12344569259
  28 | 01780145
  30 | 355798
  32 | 12712
  34 | 1
  36 | 13
hist(ChickWeight$weight) # histogram of the same data, for comparison

min(ChickWeight$weight) # check the minimum value
[1] 35
max(ChickWeight$weight) # check the maximum value
[1] 373

Kalau nilai min dan max yang terbaca dari stem-and-leaf plot kurang tepat (misalnya karena beberapa batang digabung jadi satu baris), ganti nilai scale-nya

stem(ChickWeight$weight, scale = 5) # larger scale lengthens the plot, so stems are no longer merged

  The decimal point is 1 digit(s) to the right of the |

   3 | 599999999
   4 | 000001111111111111111111122222222222222233334
   4 | 5667888888889999999999999
   5 | 00000011111111222233333444
   5 | 5555566667778888899999
   6 | 001111111222222223333344444
   6 | 555556666777778888889
   7 | 001111112222223333344444444
   7 | 6667778889999
   8 | 001122233444444
   8 | 5555556677778899999
   9 | 0001223333
   9 | 566666788888889
  10 | 0000111122233333334
  10 | 5666677788899
  11 | 0112222344
  11 | 5555667789
  12 | 0000222333334444
  12 | 555566778889
  13 | 0113444
  13 | 555566788889
  14 | 111234444
  14 | 5555666667778889
  15 | 0011234444
  15 | 555666777777789
  16 | 000022333344444
  16 | 6678899
  17 | 000013444
  17 | 5555789
  18 | 12244444
  18 | 55567778
  19 | 222
  19 | 5677778889999
  20 | 0123444
  20 | 5555579
  21 | 0024
  21 | 5578
  22 | 00123
  22 | 577
  23 | 01123344
  23 | 556788
  24 | 0
  24 | 8
  25 | 001
  25 | 699
  26 | 12344
  26 | 569
  27 | 2
  27 | 59
  28 | 01
  28 | 78
  29 | 014
  29 | 5
  30 | 3
  30 | 5579
  31 | 
  31 | 8
  32 | 12
  32 | 7
  33 | 12
  33 | 
  34 | 1
  34 | 
  35 | 
  35 | 
  36 | 1
  36 | 
  37 | 3
stem(ChickWeight$weight, width = 100) # width sets the desired width of the printed plot

  The decimal point is 1 digit(s) to the right of the |

   2 | 599999999
   4 | 0000011111111111111111111222222222222222333345667888888889999999999999000000111111112222+18
   6 | 0011111112222222233333444445555566667777788888890011111122222233333444444446667778889999
   8 | 00112223344444455555566777788999990001223333566666788888889
  10 | 0000111122233333334566667778889901122223445555667789
  12 | 00002223333344445555667788890113444555566788889
  14 | 11123444455556666677788890011234444555666777777789
  16 | 00002233334444466788990000134445555789
  18 | 12244444555677782225677778889999
  20 | 0123444555557900245578
  22 | 0012357701123344556788
  24 | 08001699
  26 | 12344569259
  28 | 01780145
  30 | 355798
  32 | 12712
  34 | 1
  36 | 13

DOT DIAGRAM

# Open the help page for dotchart().
?dotchart
# Inspect the mtcars dataset in the data viewer.
View(mtcars)

Paling Sederhana

# Simplest call: plot the mpg values with no labels or title.
dotchart(mtcars$mpg)

Tambah Judul dan Label

# Same dot chart, now with the car names as row labels, an x-axis
# label, and a title.
dotchart(
  x = mtcars$mpg,
  labels = rownames(mtcars),
  main = "Persebaran Jarak Yang Dapat Ditempuh Per Galon",
  xlab = "mpg",
  cex = 0.9
)

Berdasarkan Grup

# Dot chart of mpg grouped by number of cylinders, one colour per group.
grps <- as.factor(mtcars$cyl)
my_cols <- c("blue", "darkgreen", "orange")
# my_cols[grps] indexes the palette by the factor's integer codes, so
# every car is drawn in the colour of its cylinder group.
dotchart(mtcars$mpg, labels = row.names(mtcars),
         groups = grps, gcolor = my_cols,
         color = my_cols[grps],
         cex = 0.9,  pch = 22, xlab = "mpg",
         main = "Persebaran Jarak Yang Dapat Ditempuh Per Galon Berdasarkan Jumlah Silinder")
# Legend labels come from the factor levels so they always match the
# groups that the colours in my_cols are assigned to.
legend("bottomright", legend = levels(grps),
       fill = my_cols, cex = 0.8)

Distribusi Diskrit

plot pdf, cdf, dan data yang dibangkitkan dari distribusi tersebut

Distribusi Diskrit: domainnya (dalam hal ini ruang sampel) hanya bisa diskrit

pdf -> Pr(X=x)

cdf -> Pr(X <= x)

library("Rlab")
Rlab 4.0 attached.

Attaching package: 'Rlab'
The following objects are masked from 'package:stats':

    dexp, dgamma, dweibull, pexp, pgamma, pweibull, qexp, qgamma,
    qweibull, rexp, rgamma, rweibull
The following object is masked from 'package:datasets':

    precip

Bernoulli

PDF

dbern(0, prob = 0.3) # 0 is the point of the domain being evaluated, i.e. this computes f(0)
[1] 0.7
# The prob parameter is the probability of success, i.e. f(1).
dbern(1, prob = 0.6)
[1] 0.6

plot pdf

# The two points at which the Bernoulli PDF is evaluated: 0 and 1.
x <- seq(0, 1, by = 1)
# Only the y-values are passed, so the x-axis shows the index, not x.
plot(dbern(x, prob = 0.6))

visualisasi masih jelek,

improve plotnya coba (terutama perhatiin sumbu x nya) -> bukan domain tapi index doang

# PDF of Bernoulli(p = 0.6) plotted against the actual domain values,
# drawn as large filled points with the y-axis fixed to [0, 1].
plot(
  x = x,
  y = dbern(x, prob = 0.6),
  pch = 20, cex = 2,
  ylim = c(0, 1),
  xlab = "x", ylab = "f(x)",
  main = "PDF Distribusi Bernoulli dengan p = 0.6"
)

# Same PDF plot, but with type = "o" so the points are also joined by
# a solid line.
plot(
  x = x,
  y = dbern(x, prob = 0.6),
  type = "o",
  pch = 20, cex = 2,
  ylim = c(0, 1),
  xlab = "x", ylab = "f(x)",
  main = "PDF Distribusi Bernoulli dengan p = 0.6"
)

hindari plot seperti ini dalam distribusi diskrit, kenapa?

balik lagi, domainnya diskrit jadi harusnya ga terdefinisi untuk 0 < x < 1 sehingga harusnya tidak boleh dihubungkan oleh garis lurus

kalo pake garis putus2 masih oke lah, tapi kurang recommend untuk distribusi diskrit:

# Bernoulli PDF with the points joined by a dashed line (lty = 2).
# The title states the same p that is passed to dbern().
plot(x,
     dbern(x, prob = 0.6),
     main = "PDF Distribusi Bernoulli dengan p = 0.6",
     xlab = "x",
     ylab = "f(x)",
     ylim = c(0, 1),
     pch = 20,
     cex = 2,
     type = "o", 
     lty = 2)

# Evaluate the Bernoulli PDF on a wider grid, x = 0, 1, ..., 10.
x2 <- seq(0, 10, by = 1)
plot(x2,
     dbern(x2, prob = 0.6),
     pch = 20,
     cex = 2,
     xaxp = c(0,10,10)) # x-axis ticks from 0 to 10 in 10 intervals

apa kesimpulannya? untuk x = 2,3,4,… f(x) = 0

CDF

pbern(0, prob = 0.6) # F(0) = Pr(X <= 0)
[1] 0.4
pbern(1, prob = 0.6) # F(1) = Pr(X <= 1)
[1] 1
# Plot the Bernoulli CDF on the grid x2.
plot(x2,
     pbern(x2, prob = 0.6),
     pch = 20,
     cex = 2,
     xaxp = c(0,10,10)) # x-axis ticks from 0 to 10 in 10 intervals

apa kesimpulannya? untuk x = 0, F(x) = 1-p; untuk x = 1,2,3,... F(x) = 1

Random

bangkitkan n data dari distribusi bernoulli

# Draw n Bernoulli samples with prob = 0.6 under a fixed seed, then
# print the relative frequency of each outcome.
set.seed(122)
n <- 100
random_bern <- rbern(n, prob = 0.6)
prop.table(table(random_bern))
random_bern
   0    1 
0.42 0.58 
# Bar chart of the outcome counts; barplot() returns the bar midpoints,
# which are reused to place each count label just above its bar.
random_bern_plot <- barplot(height = table(random_bern), ylim = c(0, 100))
text(
  random_bern_plot,
  table(random_bern),
  labels = table(random_bern),
  pos = 3
)

Distribusi Binomial

PDF

# Pr(X = 2) where X ~ binom(10, 0.6)
dbinom(x = 2, size = 10, prob = 0.6)
[1] 0.01061683
# Plot the binomial PDF over the grid x2 with size = 10.
# NOTE(review): this uses prob = 0.4, whereas the dbinom()/rbinom()
# calls in this section use prob = 0.6 -- confirm which is intended.
plot(x2,
     dbinom(x2,10,0.4),
     xaxp = c(0,10,10),
     pch = 20,
     ylim = c(0, 0.3)) # plot the PDF

CDF

# Pr(X <= 2) where X ~ binom(10, 0.6)
pbinom(q = 2, size = 10, prob = 0.6)
[1] 0.01229455
# Plot the binomial CDF over the grid x2 with size = 10.
# NOTE(review): this uses prob = 0.4, whereas the pbinom()/rbinom()
# calls in this section use prob = 0.6 -- confirm which is intended.
plot(x2,
     pbinom(x2,10,0.4),
     xaxp = c(0,10,10),
     pch = 20,
     ylim = c(0, 1))

Random

bangkitkan n data dari distribusi binomial

# Draw n samples from binom(size = 10, prob = 0.6) under a fixed seed,
# then print the relative frequency of each observed value.
set.seed(122)
n <- 100
random_binom <- rbinom(n = n, size = 10, prob = 0.6)
prop.table(table(random_binom))
random_binom
   3    4    5    6    7    8    9   10 
0.05 0.14 0.17 0.27 0.21 0.08 0.06 0.02 
# Bar chart of the sample counts; barplot() returns the bar midpoints,
# which are reused to place each count label just above its bar.
random_binom_plot <- barplot(height = table(random_binom), ylim = c(0, 40))
text(
  random_binom_plot,
  table(random_binom),
  labels = table(random_binom),
  pos = 3
)

Distribusi lain

untuk distribusi lain, intinya tetap sama hanya sesuaikan parameternya saja

format:

  • pdf -> d+nama distribusi()

    misal pdf poisson berarti dpois()

  • cdf -> p+nama distribusi()

    misal cdf poisson berarti ppois()

  • data random -> r+nama distribusi()

    misal data random dari distribusi poisson berarti rpois()

selengkapnya bisa cek di dokumentasi

# Open the help page listing the distributions available in the stats package.
?Distributions